INSERT OVERWRITE TABLE user_active
SELECT user.*
FROM user
WHERE user.active = 1;
jin-uyu@~$curl http://apache.mirror.cdnetworks.com/hive/hive-0.9.0/hive-0.9.0.tar.gz -o hive-0.9.0.tar.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 28.7M  100 28.7M    0     0  2316k      0  0:00:12  0:00:12 --:--:-- 2678k
# global variables
HADOOP_HOME=/Users/jin-uyu/hadoop-0.20.203.0
HBASE_HOME=/Users/jin-uyu/hbase-0.92.1
HIVE_HOME=/Users/jin-uyu/hive
PATH=$PATH:$HADOOP_HOME/bin:$HBASE_HOME/bin:$HIVE_HOME/bin
export HADOOP_HOME
export HBASE_HOME
export HIVE_HOME
export PATH
jin-uyu@~/hive/conf$cp hive-log4j.properties.template hive-log4j.properties
jin-uyu@~/hive/conf$cp hive-exec-log4j.properties.template hive-exec-log4j.properties
hive.log.dir=/Users/jin-uyu/logs/hive
jin-uyu@~/hadoop/bin$hadoop fs -mkdir /tmp
jin-uyu@~/hadoop/bin$hadoop fs -mkdir /user/hive/warehouse
jin-uyu@~/hadoop/bin$hadoop fs -chmod g+w /tmp
jin-uyu@~/hadoop/bin$hadoop fs -chmod g+w /user/hive/warehouse
Much like an RDBMS, Hive provides a variety of built-in functions:
mathematical functions such as round, floor, ceil, and rand;
date, conditional, and string functions;
aggregate functions (UDAFs) such as count, min, and max;
and functions for handling HTML, XML, and JSON.
The full list is documented at the link below.
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
SELECT length(string_col) FROM table_name;
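A few of these categories in action, as a minimal sketch (pokes, foo, and bar come from the walkthrough below; the events table and its json_col column are hypothetical):
SELECT round(3.7), floor(3.7), ceil(3.7), rand() FROM pokes LIMIT 1;   -- mathematical
SELECT concat(bar, '_x'), upper(bar), length(bar) FROM pokes LIMIT 5;  -- string
SELECT count(*), min(foo), max(foo) FROM pokes;                        -- aggregate (UDAF)
SELECT get_json_object(json_col, '$.user.id') FROM events;             -- JSON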
https://cwiki.apache.org/confluence/display/Hive/HivePlugins
Besides the UDFs that Hive provides, you can also write your own.
package com.example.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// A simple UDF that lower-cases a string column (from the HivePlugins wiki page).
public final class Lower extends UDF {
  public Text evaluate(final Text s) {
    if (s == null) { return null; }
    return new Text(s.toString().toLowerCase());
  }
}
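After compiling the class into a jar, it has to be added to the CLI classpath before the function can be registered; my_jar.jar is the placeholder name used on the HivePlugins page:
hive> add jar my_jar.jar;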
create temporary function my_lower as 'com.example.hive.udf.Lower';
hive> select my_lower(title), sum(freq) from titles group by my_lower(title);
package com.example.hive.udaf;

import java.util.Collections;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// A UDF that sorts an array column and returns its second-smallest element,
// or null when the array is missing or has two or fewer elements.
public final class ArrayTest extends UDF {
  public Text evaluate(List<Text> list) {
    if (list == null) {
      return null;
    }
    Collections.sort(list);
    if (list.size() > 2) {
      return list.get(1);
    }
    return null;
  }
}
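As with Lower, the class must be packaged and registered before use; the jar, function, table, and column names below are illustrative (arr_col is assumed to be an array<string> column):
hive> add jar array_test.jar;
hive> create temporary function array_test as 'com.example.hive.udaf.ArrayTest';
hive> select array_test(arr_col) from some_table;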
jin-uyu@~/hive/bin$hive
hive> show tables;
OK
Time taken: 0.047 seconds
Show all of the current settings:
hive> SET -v;
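A single property can be read or overridden the same way; the two below reappear in the MapReduce job logs later in this walkthrough (the value 2 is only illustrative):
hive> SET hive.exec.reducers.max;
hive> SET mapred.reduce.tasks=2;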
hive> CREATE TABLE pokes (foo INT, bar STRING);
OK
Time taken: 0.351 seconds
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
OK
Time taken: 0.039 seconds
hive> SHOW TABLES;
OK
invites
pokes
Time taken: 0.15 seconds
hive> SHOW TABLES '.*s';
OK
invites
pokes
Time taken: 0.05 seconds
hive> DESCRIBE invites;
OK
foo int
bar string
ds string
Time taken: 0.158 seconds
Syntax:
LOAD DATA LOCAL INPATH '<local path>' INTO TABLE <table_name>;
This copies data from the client filesystem into HDFS.
Be careful: the file format must match the CREATE TABLE definition!
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
Copying data from file:/Users/jin-uyu/hive-0.9.0/examples/files/kv1.txt
Copying file: file:/Users/jin-uyu/hive-0.9.0/examples/files/kv1.txt
Loading data to table default.pokes
Deleted hdfs://localhost:9100/user/hive/warehouse/pokes
OK
Time taken: 0.439 seconds
hive> SELECT * FROM pokes;
OK
238 val_238
86 val_86
311 val_311
27 val_27
165 val_165
409 val_409
255 val_255
278 val_278
98 val_98
484 val_484
265 val_265
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
OK
Time taken: 0.05 seconds
hive> DESCRIBE invites;
OK
foo int
bar string
ds string
Time taken: 0.12 seconds
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2012-08-15');
Copying data from file:/Users/jin-uyu/hive-0.9.0/examples/files/kv2.txt
Copying file: file:/Users/jin-uyu/hive-0.9.0/examples/files/kv2.txt
Loading data to table default.invites partition (ds=2012-08-15)
OK
Time taken: 0.294 seconds
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2012-08-08');
Copying data from file:/Users/jin-uyu/hive-0.9.0/examples/files/kv2.txt
Copying file: file:/Users/jin-uyu/hive-0.9.0/examples/files/kv2.txt
Loading data to table default.invites partition (ds=2012-08-08)
OK
Time taken: 0.167 seconds
hive> CREATE TABLE invites_back (foo INT, bar STRING) PARTITIONED BY (ds STRING);
OK
Time taken: 0.458 seconds
hive> INSERT OVERWRITE TABLE invites_back SELECT a.* FROM invites a WHERE ds = '2012-08-15';
FAILED: Error in semantic analysis: 1:23 Need to specify partition columns because the destination table is partitioned. Error encountered near token 'invites_back'
hive> INSERT OVERWRITE TABLE invites_back PARTITION(ds='2012-08-15') SELECT a.* FROM invites a WHERE a.ds='2012-08-15';
FAILED: Error in semantic analysis: Line 1:23 Cannot insert into target table because column number/types are different ''2012-08-15'': Table insclause-0 has 2 columns, but query has 3 columns.
The first INSERT fails because the destination table is partitioned, so a PARTITION clause must be specified. The second fails because SELECT a.* also returns the partition column ds, leaving three columns for a two-column destination. Selecting only foo and bar works:
jin-uyu@~/hive$hive
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Logging initialized using configuration in file:/Users/jin-uyu/hive-0.9.0/conf/hive-log4j.properties
Hive history file=/tmp/jin-uyu/hive_job_log_jin-uyu_201211242218_484295426.txt
hive> INSERT OVERWRITE TABLE invites_back PARTITION (ds='2012-08-15') SELECT a.foo, a.bar FROM invites a WHERE a.ds='2012-08-15';
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201211111542_0001, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201211111542_0001
Kill Command = /Users/jin-uyu/hadoop-0.20.203.0/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9101 -kill job_201211111542_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2012-11-24 22:19:40,086 Stage-1 map = 0%, reduce = 0%
2012-11-24 22:19:46,154 Stage-1 map = 100%, reduce = 0%
2012-11-24 22:19:51,251 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201211111542_0001
Ended Job = -914122630, job is filtered out (removed at runtime).
Moving data to: hdfs://localhost:9100/tmp/hive-jin-uyu/hive_2012-11-24_22-19-19_703_3525511015470568269/-ext-10000
Loading data to table default.invites_back partition (ds=2012-08-15)
Partition default.invites_back{ds=2012-08-15} stats: [num_files: 1, num_rows: 0, total_size: 5791, raw_data_size: 0]
Table default.invites_back stats: [num_partitions: 1, num_files: 1, num_rows: 0, total_size: 5791, raw_data_size: 0]
500 Rows loaded to invites_back
MapReduce Jobs Launched:
Job 0: Map: 1 HDFS Read: 6016 HDFS Write: 5791 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 32.785 seconds

hive> select count(1) from invites_back;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Starting Job = job_201211242253_0001, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201211242253_0001
Kill Command = /Users/jin-uyu/hadoop-0.20.203.0/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9101 -kill job_201211242253_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2012-11-24 22:56:13,082 Stage-1 map = 0%, reduce = 0%
2012-11-24 22:56:19,138 Stage-1 map = 100%, reduce = 0%
2012-11-24 22:56:31,225 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201211242253_0001
MapReduce Jobs Launched:
Job 0: Map: 1 Reduce: 1 HDFS Read: 6022 HDFS Write: 4 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
500
Time taken: 41.932 seconds
hive> DROP TABLE invites;
OK
Time taken: 1.156 seconds
hive> ALTER TABLE invites_back RENAME TO invites;
OK
Time taken: 0.652 seconds
hive> show tables;
OK
invites
pokes
Time taken: 0.061 seconds
The execution plan for a query can be inspected with EXPLAIN:
hive> EXPLAIN select a.* from invites a where foo < 100 order by a.ds;
OK
ABSTRACT SYNTAX TREE:
(TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME invites) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a)))) (TOK_WHERE (< (TOK_TABLE_OR_COL foo) 100)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL a) ds)))))
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 is a root stage

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        a
          TableScan
            alias: a
            Filter Operator
              predicate:
                  expr: (foo < 100)
                  type: boolean
              Select Operator
                expressions:
                      expr: foo
                      type: int
                      expr: bar
                      type: string
                      expr: ds
                      type: string
                outputColumnNames: _col0, _col1, _col2
                Reduce Output Operator
                  key expressions:
                        expr: _col2
                        type: string
                  sort order: +
                  tag: -1
                  value expressions:
                        expr: _col0
                        type: int
                        expr: _col1
                        type: string
                        expr: _col2
                        type: string
      Reduce Operator Tree:
        Extract
          File Output Operator
            compressed: false
            GlobalTableId: 0
            table:
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

  Stage: Stage-0
    Fetch Operator
      limit: -1
Time taken: 0.061 seconds
To list the available functions:
hive> show functions;
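DESCRIBE FUNCTION (optionally EXTENDED) prints a short description of a single function; length is used here only as an example:
hive> describe function length;
hive> describe function extended length;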
1) A table and storage (metastore) management layer for Hadoop.
2) Lets different data-processing tools such as Pig, MapReduce, Hive, and Streaming read and write data on the grid more easily and use it in Hadoop.
(Streaming is not currently supported.)
3) Through a table abstraction, HCatalog presents the data in HDFS as a relational view.
4) Users do not need to care where or in what format the data is stored.
(RCFile format, text file, sequence file)
1) HCatalog is built on the Hive metastore and the Hive DDL component.
2) HCatalog provides read/write interfaces for Pig and MapReduce, plus a command-line interface for defining data.
(Notification is not available.)
This is the architecture before HCatalog is applied.
The following shows the architecture with HCatalog applied.
Here is a simple example of moving data onto the grid and analyzing it with Hive,
covering the case without HCatalog and the case with it separately.
First, Joe in data acquisition uses distcp to get the data onto the grid.
hadoop distcp file:///file.dat hdfs://data/rawevents/20100819/data
hcat "alter table rawevents add partition 20100819 hdfs://data/rawevents/20100819/data"
Second, Sally in data processing uses Pig to cleanse and prepare the data.
Without HCatalog, Sally must be manually informed by Joe that the data is available, or use Oozie to poll on HDFS.
A = load '/data/rawevents/20100819/data' as (alpha:int, beta:chararray, ...);
B = filter A by bot_finder(zeta) == 0;
...
store Z into 'data/processedevents/20100819/data';
With HCatalog, HCatalog notifies Oozie when the data is available, and Oozie can then start the Pig job:
A = load 'rawevents' using HCatLoader;
B = filter A by date == '20100819' and bot_finder(zeta) == 0;
...
store Z into 'processedevents' using HCatStorer("date=20100819");
Third, Robert in client management uses Hive to analyze his clients' results.
Without HCatalog, Robert must alter the table to add the required partition.
alter table processedevents add partition (date='20100819') location 'hdfs://data/processedevents/20100819/data';
select advertiser_id, count(clicks)
from processedevents
where date = '20100819'
group by advertiser_id;
With HCatalog, Robert does not need to modify the table structure.
select advertiser_id, count(clicks)
from processedevents
where date = '20100819'
group by advertiser_id;
1) Provides a REST-like API for accessing HCatalog and Hadoop components.
2) Web service interface: Hadoop MapReduce, Pig, Hive, and HCatalog DDL.
3) HCatalog DDL commands run as soon as they are requested.
4) Pig, Hive, and MapReduce jobs are queued and executed later.
There is no streamlined installation procedure yet; see:
http://people.apache.org/~thejas/templeton_doc_v1/installation.html
Source:
https://github.com/hortonworks/templeton
http://yourserver/templeton/v1/resource
where "yourserver" is replaced with your server name, and "resource" is replaced by the Templeton resource name.
For example, to check whether the Templeton server is running, you could access the following URL:
http://www.myserver.com/templeton/v1/status
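From the shell this is a one-line check; localhost and the default Templeton port 50111 are assumptions about your setup, and the JSON body shown is the typical healthy response:
jin-uyu@~$curl -s 'http://localhost:50111/templeton/v1/status'
{"status":"ok","version":"v1"}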
http://people.apache.org/~thejas/templeton_doc_v1/resources.html



[STUDY:1] Apache Hive GettingStarted https://cwiki.apache.org/confluence/display/Hive/GettingStarted
[STUDY:2] Apache Hive Language Manual https://cwiki.apache.org/Hive/languagemanual.html
[STUDY:3] HCatalog http://incubator.apache.org/hcatalog/docs/r0.2.0/
[STUDY:4] Future of HCatalog and Templeton http://www.slideshare.net/hortonworks/future-of-hcatalog-hadoop-summit-2012
[STUDY:5] Introduction to HCatalog http://geekdani.wordpress.com/2012/07/11/introduction-to-hcatalog/
[STUDY:6] Templeton 0.1.0 documentation http://people.apache.org/~thejas/templeton_doc_v1/
[STUDY:7] Templeton PDF: templeton_docs.pdf
[STUDY:8] Templeton resource list http://people.apache.org/~thejas/templeton_doc_v1/resources.html